In [1]:
# Import necessary modules
import os
import glob

import pandas as pd
import statsmodels.formula.api as smf

import plotly.graph_objects as go # for interactive plot with human defined labels
import plotly.express as px
In [2]:
# Load necessary data
path = "/Users/ZXue/src/speeding-up-sci-correlation/"
df = pd.read_table(os.path.join(path, "TARA_025-merged-only-those-in-both-full-TPM.tsv"))
df.head() # a look at the sample
Out[2]:
Gene DNA RNA
0 TOBG-MED-1076_1101 3.578633 12.992625
1 TOBG-MED-1076_1116 0.714860 4.037256
2 TOBG-MED-1076_1131 7.727037 5.454919
3 TOBG-MED-1076_1151 2.859440 15.572275
4 TOBG-MED-1076_1195 8.813051 12.379700
In [3]:
## Add KO ID to df
df_anno = pd.read_table(os.path.join(path, "Our-Kegg-annotations.tsv"))
df_anno.head()

df = df.merge(df_anno, on='Gene')
# remove genes that are not annotated with a KO ID
df.dropna()
Out[3]:
Gene DNA RNA KO_ID
0 TOBG-MED-1076_1101 3.578633 12.992625 K01940
1 TOBG-MED-1076_1116 0.714860 4.037256 K00962
2 TOBG-MED-1076_1131 7.727037 5.454919 K01696
3 TOBG-MED-1076_1151 2.859440 15.572275 K02990
6 TOBG-MED-1076_1298 2.923141 4.295652 K00347
... ... ... ... ...
24039 TOBG-MED-831_959 1.353264 7.915675 K00005
24040 TOBG-MED-831_960 8.061755 34.266264 K03696
24041 TOBG-MED-831_978 0.950462 1.150253 K03455
24042 TOBG-MED-831_98 0.479963 3.630331 K08309
24043 TOBG-MED-831_984 1.385115 3.352542 K05592

13165 rows × 4 columns

In [4]:
## Add pathway info to df
df_path = pd.read_table(os.path.join(path, "kegg-groups-Lee-2017.tsv"))
df_path.head()

df = df.merge(df_path, on='KO_ID') ## only merge overlapping KO IDs!! USER BEAWARE!
df.head()
Out[4]:
Gene DNA RNA KO_ID Category1
0 TOBG-MED-1076_1101 3.578633 12.992625 K01940 Arginine/proline metabolism
1 TOBG-MED-590_1583 23.569356 25.271020 K01940 Arginine/proline metabolism
2 TOBG-MED-591_447 93.122907 14.655755 K01940 Arginine/proline metabolism
3 TOBG-MED-594_2486 63.978247 11.589350 K01940 Arginine/proline metabolism
4 TOBG-MED-662_398 1.237459 7.987083 K01940 Arginine/proline metabolism
In [5]:
# Make the figure with scatter dots
tracedot = go.Scatter(x=df['DNA'], # User should change this based on dataframe columns
                      y=df['RNA'], # User defined 
                      mode='markers', # for scatter plot
                      text=df['KO_ID'], # hover text
                      #color=df['Category1'],
                      name='Data') # User defined
In [6]:
# Initialise and fit linear regression model using `statsmodels`
### tutorial https://towardsdatascience.com/introduction-to-linear-regression-in-python-c12a072bedf0
model = smf.ols('RNA ~ DNA', data=df)
model = model.fit()
# get linear regression parameters
model.params
Out[6]:
Intercept    13.730471
DNA           0.441281
dtype: float64
In [7]:
# get linear regression R^2 
model.rsquared
Out[7]:
0.024103814649317212
In [8]:
# get linear regression p-value
model.pvalues
Out[8]:
Intercept    2.117803e-05
DNA          9.853081e-38
dtype: float64
In [9]:
# linear regression equation
max_val = df['DNA'].max()
df2 = [[0, model.params[0]],[max_val, model.params[1]*max_val+model.params[0]]]
df2 = pd.DataFrame(df2, columns = ['xi', 'y']) 
 

# Make the line graph 
trace_ln = go.Scatter(
                  x=df2['xi'],
                  y=df2['y'],
                  mode='lines',
                  name='Fit'
                  )
In [10]:
# Make the text string indicating the linear regression model

eq_txt = "$R^2 = {:.3f}, \n y = {:.3f}x+{:.3f}$".format(model.rsquared, model.params[1], model.params[0])

annotation = go.layout.Annotation(
                  x=df['DNA'].max(),
                  y=df['RNA'].max(),
                  text=eq_txt,
                  showarrow=False,
                  font=go.Font(size=16)
                  )
layout = go.Layout(annotations=[annotation])


# Tie everything together to a figure
fig = go.Figure([tracedot,trace_ln], layout=layout)


# Edit the figure layout 
fig.update_layout(title='Gene correlations between metagenomics and metatranscriptomic resutls',
                  xaxis_title='Count (Metagenomics)',
                  yaxis_title='Count (Metatranscriptomics)')

fig.show()
/Users/ZXue/miniconda3/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:329: DeprecationWarning:

plotly.graph_objs.Font is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Font
  - plotly.graph_objs.layout.hoverlabel.Font
  - etc.


In [11]:
### go trace plotting can not set color to dots by category 
## so I am now trying to set color with px plotting
## and add line and text through adding trace
figdot = px.scatter(df, x="DNA", y="RNA", color="Category1", hover_data=['KO_ID'])

# a "dummy" scatter plot trace that is used to carry the text 
trace_txt = go.Scatter(   
    x=[df['DNA'].max()], ##########
    y=[df['RNA'].max()],
    mode='markers+text',
    text=eq_txt,
    marker=dict(size = 1),
    name='linear regression model',
    textposition='bottom left'
)

# add a trace to represent 1:1 regression (y=x)
df3 = [[0, 0],[max_val, max_val]]
df3 = pd.DataFrame(df3, columns = ['xi', 'y']) 
trace_yx = go.Scatter(
                  x=df3['xi'],
                  y=df3['y'],
                  mode='lines',
                  line = dict(color='grey', width=4, dash='dot'),
                  name='1:1 line'
                  )


figdot.add_trace(trace_ln).add_trace(trace_txt).add_trace(trace_yx)
#go.Figure(figdot, layout=layout) # why is the text in layout settings not showing up?!
figdot.show()
In [18]:
'''An example of interesting results: K01574 was expressed at very high level in metatrancriptomic results but its "copy number"
is relatively low by metagenomic measurements. '''

'''This KO ID is linked to both carbohydrate and lipid metabolism, suggesting robust microbial activities
within in the sampled Tara oceam specimen.'''

df[df['KO_ID']  == 'K01574']
Out[18]:
Gene DNA RNA KO_ID Category1
6619 TOBG-MED-731_1009 135.490385 14054.812733 K01574 Fatty acid metabolism
6620 TOBG-MED-731_987 106.409456 1420.631522 K01574 Fatty acid metabolism
In [42]:
## To make a web-based application using plotly?
Out[42]:
42.77276000869718
In [50]:
print(eq_txt)
$R^2 = 0.024, 
 y = 0.441x+13.730$